I created several linear regression models using temperature as the dependent variable.
# Simple linear regression of temperature on low cloud coverage.
temp_lowc <- lm(temperature ~ cloudlow, data = combine)
temp_lowc %>% tidy() %>% as.data.frame()
## term estimate std.error statistic p.value
## 1 (Intercept) 277.7132586 2.7138125 102.33325 5.691620e-78
## 2 cloudlow 0.6569496 0.1328456 4.94521 5.022951e-06
# Scatter plot with the fitted line overlaid. The intercept and slope are
# read from the fitted model instead of being retyped (and rounded) from the
# printed output, so the line always matches the current fit.
lowcg <- ggplot(combine, aes(x = cloudlow, y = temperature)) +
  geom_point() +
  xlab("Low Cloud Coverage") +
  ylab("Temperature") +
  geom_abline(
    intercept = coef(temp_lowc)[["(Intercept)"]],
    slope = coef(temp_lowc)[["cloudlow"]],
    col = "red"
  )
lowcg
# Simple linear regression of temperature on middle cloud coverage.
temp_midc <- lm(temperature ~ cloudmid, data = combine)
temp_midc %>% tidy() %>% as.data.frame()
## term estimate std.error statistic p.value
## 1 (Intercept) 313.376097 2.2739709 137.81008 5.642312e-87
## 2 cloudmid -1.167782 0.1110144 -10.51919 4.670486e-16
# Scatter plot with the fitted line overlaid. The intercept and slope are
# read from the fitted model instead of being retyped (and rounded) from the
# printed output, so the line always matches the current fit.
midcg <- ggplot(combine, aes(x = cloudmid, y = temperature)) +
  geom_point() +
  xlab("Middle Cloud Coverage") +
  ylab("Temperature") +
  geom_abline(
    intercept = coef(temp_midc)[["(Intercept)"]],
    slope = coef(temp_midc)[["cloudmid"]],
    col = "red"
  )
midcg
# Simple linear regression of temperature on high cloud coverage.
temp_highc <- lm(temperature ~ cloudhigh, data = combine)
temp_highc %>% tidy() %>% as.data.frame()
## term estimate std.error statistic p.value
## 1 (Intercept) 298.8933145 1.7554467 170.266243 2.193430e-93
## 2 cloudhigh -0.8519088 0.1541152 -5.527739 5.221285e-07
# Scatter plot with the fitted line overlaid. The intercept and slope are
# read from the fitted model instead of being retyped (and rounded) from the
# printed output, so the line always matches the current fit.
highcg <- ggplot(combine, aes(x = cloudhigh, y = temperature)) +
  geom_point() +
  xlab("High Cloud Coverage") +
  ylab("Temperature") +
  geom_abline(
    intercept = coef(temp_highc)[["(Intercept)"]],
    slope = coef(temp_highc)[["cloudhigh"]],
    col = "red"
  )
highcg
# Simple linear regression of temperature on ozone level.
temp_ozone <- lm(temperature ~ ozone, data = combine)
temp_ozone %>% tidy() %>% as.data.frame()
## term estimate std.error statistic p.value
## 1 (Intercept) 337.8292030 12.16953566 27.760238 1.634629e-39
## 2 ozone -0.1525832 0.03904494 -3.907888 2.124625e-04
# Scatter plot with the fitted line overlaid. The intercept and slope are
# read from the fitted model instead of being retyped (and rounded) from the
# printed output, so the line always matches the current fit.
ozoneg <- ggplot(combine, aes(x = ozone, y = temperature)) +
  geom_point() +
  xlab("Ozone Level") +
  ylab("Temperature") +
  geom_abline(
    intercept = coef(temp_ozone)[["(Intercept)"]],
    slope = coef(temp_ozone)[["ozone"]],
    col = "red"
  )
ozoneg
# Simple linear regression of temperature on surface temperature.
temp_surftemp <- lm(temperature ~ surftemp, data = combine)
temp_surftemp %>% tidy() %>% as.data.frame()
## term estimate std.error statistic p.value
## 1 (Intercept) 86.0301279 10.9520922 7.855132 3.376424e-11
## 2 surftemp 0.7002365 0.0374963 18.674818 6.705231e-29
# Scatter plot with the fitted line overlaid. The intercept and slope are
# read from the fitted model instead of being retyped (and rounded) from the
# printed output, so the line always matches the current fit.
surfg <- ggplot(combine, aes(x = surftemp, y = temperature)) +
  geom_point() +
  xlab("Surface Temperature") +
  ylab("Temperature") +
  geom_abline(
    intercept = coef(temp_surftemp)[["(Intercept)"]],
    slope = coef(temp_surftemp)[["surftemp"]],
    col = "red"
  )
surfg
# Simple linear regression of temperature on atmospheric pressure.
temp_pres <- lm(temperature ~ pressure, data = combine)
temp_pres %>% tidy() %>% as.data.frame()
## term estimate std.error statistic p.value
## 1 (Intercept) 259.84483680 20.06505232 12.950120 2.938691e-20
## 2 pressure 0.03408459 0.02234757 1.525203 1.317141e-01
# Scatter plot with the fitted line overlaid. The intercept and slope are
# read from the fitted model instead of being retyped (and rounded) from the
# printed output, so the line always matches the current fit.
presg <- ggplot(combine, aes(x = pressure, y = temperature)) +
  geom_point() +
  xlab("Atmospheric Pressure") +
  ylab("Temperature") +
  geom_abline(
    intercept = coef(temp_pres)[["(Intercept)"]],
    slope = coef(temp_pres)[["pressure"]],
    col = "red"
  )
presg
The following code combines all six graphs into one figure.
# Lay the six single-predictor plots out on a 2 x 3 grid.
figure <- ggarrange(
  lowcg, midcg, highcg,
  ozoneg, surfg, presg,
  nrow = 2,
  ncol = 3
)
figure
From the simple linear regressions, pressure was the only variable that did not show a statistically significant linear relationship with temperature (p ≈ 0.13). Therefore, the multiple linear regression model does not use pressure as a predictor.
# Multiple linear regression of temperature on the three cloud-coverage
# variables, ozone, and surface temperature (pressure is left out).
model <- lm(
  temperature ~ cloudlow + cloudmid + cloudhigh + ozone + surftemp,
  data = combine
)
summary(model)
##
## Call:
## lm(formula = temperature ~ cloudlow + cloudmid + cloudhigh +
## ozone + surftemp, data = combine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.7001 -1.7232 -0.0064 1.7982 4.8737
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.43522 22.89516 0.893 0.375
## cloudlow -0.65061 0.10505 -6.194 4.27e-08 ***
## cloudmid 0.16998 0.10875 1.563 0.123
## cloudhigh -0.43951 0.08269 -5.315 1.35e-06 ***
## ozone 0.01669 0.02003 0.833 0.408
## surftemp 0.95383 0.06855 13.915 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.427 on 66 degrees of freedom
## Multiple R-squared: 0.9261, Adjusted R-squared: 0.9205
## F-statistic: 165.3 on 5 and 66 DF, p-value: < 2.2e-16
I chose 50 random data points from the NASA data set (the first few are listed below):
# Draw 50 random rows from the NASA data to evaluate the model on.
# The seed is fixed so the same rows are drawn on every run; without it the
# sample, the predictions, and the final plot change each time the script is
# executed. The seed value itself is arbitrary.
set.seed(2023)
temp_pred <- sample_n(dfnasa, 50)
head(temp_pred)
## lat long month year cloudhigh cloudlow cloudmid ozone
## 1 11.243478 -91.26087 2 1996 2.5 11.0 4.5 254
## 2 6.252174 -68.72174 8 1996 31.0 13.5 26.5 262
## 3 26.217391 -108.79130 6 1995 8.0 13.0 12.0 274
## 4 11.243478 -56.20000 11 1997 5.5 27.5 9.0 254
## 5 18.730435 -76.23478 10 1995 22.0 13.5 11.0 262
## 6 -13.713043 -96.26957 11 1999 0.0 57.0 10.5 274
## pressure surftemp temperature
## 1 1000 299.2 299.6
## 2 980 298.3 300.1
## 3 875 301.9 299.2
## 4 1000 302.3 301.4
## 5 1000 301.0 302.3
## 6 1000 292.2 293.6
The predictor columns of the 50 random rows from the NASA data set are shown below.
# Split the sampled rows into the predictor columns fed to the model and the
# observed temperature kept for comparison (a one-column data frame).
model_usage <- select(
  temp_pred,
  cloudhigh, cloudlow, cloudmid, ozone, surftemp
)
real_temp <- select(temp_pred, temperature)
head(model_usage)
## cloudhigh cloudlow cloudmid ozone surftemp
## 1 2.5 11.0 4.5 254 299.2
## 2 31.0 13.5 26.5 262 298.3
## 3 8.0 13.0 12.0 274 301.9
## 4 5.5 27.5 9.0 254 302.3
## 5 22.0 13.5 11.0 262 301.0
## 6 0.0 57.0 10.5 274 292.2
The `model_usage` data frame was used to generate the predictions, while the actual temperatures were stored in `real_temp`.
# Append the model's fitted values for the sampled rows as a `pred` column.
model_predictions <- add_predictions(model_usage, model)
head(model_predictions)
## cloudhigh cloudlow cloudmid ozone surftemp pred
## 1 2.5 11.0 4.5 254 299.2 302.5689
## 2 31.0 13.5 26.5 262 298.3 291.4312
## 3 8.0 13.0 12.0 274 301.9 303.0344
## 4 5.5 27.5 9.0 254 302.3 294.2372
## 5 22.0 13.5 11.0 262 301.0 295.3273
## 6 0.0 57.0 10.5 274 292.2 268.4166
# Pair each observed temperature with its prediction. The columns are named
# when the data frame is built, rather than cbind-ing first and renaming by
# position afterwards.
actual_preddf <- data.frame(
  real = real_temp$temperature,
  prediction = model_predictions$pred
)
# Interactive predicted-vs-real scatter plot. The y = x reference line marks
# where a perfect prediction would fall.
ggplotly(
  ggplot(actual_preddf) +
    geom_point(aes(x = real, y = prediction)) +
    geom_abline(intercept = 0, slope = 1, col = "darkturquoise", size = 1) +
    xlab("Real Temperature") +
    ylab("Predicted Temperature")
)